
set rmsg on
set more off

************************************************************************
*
*	JOB:		Format sample and generate groups for estimation
*	PROJECT:	Understanding cash flow shocks
*	INPUT:		Compustat and other files
*	OUTPUT:		Sample 
*
*	DESCRIPTION: This job creates a panel data set with compustat data
*				 at the annual frequency
*
*************************************************************************

capture program drop _all
clear
clear matrix


* Define paths

	global 	data	"..."
		
	
*********************************
**** Begin: Utility Programs ****
*********************************

  **** Winsor program (winsor varname [1,5])
  capture program drop winsor
  program define winsor
    quiet sum `1', detail
    replace `1' = r(p1)  if `1' ~= . & (`1' < r(p1))  & `2' == 1
    replace `1' = r(p99) if `1' ~= . & (`1' > r(p99)) & `2' == 1
    replace `1' = r(p5)  if `1' ~= . & (`1' < r(p5))  & `2' == 5
    replace `1' = r(p95) if `1' ~= . & (`1' > r(p95)) & `2' == 5
  end
  
*******************************
**** End: Utility Programs ****
*******************************

************************************************************
**** Begin: Extract and Format of annual Compustat data ****
************************************************************
  
**** Load annual compustat data
**** This step loads Compustat data downloaded from WRDS in Stata format
		
********************************		
**** Begin sample selection
********************************
		
	use	gvkey cusip tic sic sich conm datadate fyear fyr indfmt datafmt popsrc consol curcd fic loc scf emp 		/*
	*/	ch che at ppent ppegt sale dltt dlc csho prcc_f capx xrd ib dp oibdp ceq prstkc xint txc txt dvc dvp/*
	*/ 	ivch aqc fuseo sppe siv ivstch ivaco chech dv dltis dltr dlcch sstk scstkc spstkc wcapc wcap recch invch apalch txach aoloch fiao /*
	*/	ibc xidoc dpc txdc esubc sppiv fopo fsrco exre dp act lct re pi	rect invt xsga cogs ni /*
	*/  using "$data/Compustat_NA_Annual_1965_2019.dta", clear 
	
	destring gvkey, replace
	destring sic, replace
	sort gvkey fyear
	
	* Drop duplicate observations based on gvkey and fiscal year
	duplicates list gvkey fyear
	duplicates drop gvkey fyear, force
		
**** Create calendar date, year and month variables
	g       date = .
	replace date = mdy(12,31,fyear) 	if fyr == 12
	replace date = mdy(6,30,fyear) 		if fyr == 6
	replace date = mdy(5,31,fyear+1) 	if fyr == 5
	replace date = mdy(9,30,fyear) 		if fyr == 9
	replace date = mdy(8,31,fyear) 		if fyr == 8
	replace date = mdy(10,31,fyear) 	if fyr == 10
	replace date = mdy(7,31,fyear) 		if fyr == 7
	replace date = mdy(11,30,fyear) 	if fyr == 11
	replace date = mdy(3,31,fyear+1) 	if fyr == 3
	replace date = mdy(4,30,fyear+1) 	if fyr == 4
	replace date = mdy(1,31,fyear+1) 	if fyr == 1
	replace date = mdy(2,29,fyear+1) 	if fyr == 2 & mod(fyear-1903,4) == 0
	replace date = mdy(2,28,fyear+1) 	if fyr == 2 & mod(fyear-1903,4) ~= 0
  	
	g year	= year(date)
  	g month	= month(date)
	
	
**** Merge with GDP deflator data
**** This step merges data set with GDP deflator, downloaded from FRED St. Louis
	sort  year
	merge m:1 year using "$data/GDPdeflator_2000.dta"
	drop  if _merge == 2
	drop  _merge
	
	g at_nom = at 
	g td_nom = dltt+dlc
	
	g prc_nom = prcc_f
	
**** Convert all variables into 2000 constant dollars	
	local vlist 	ch che at ppent ppegt sale dltt dlc csho prcc_f capx xrd ib dp oibdp ceq prstkc xint txc txt dvc dvp/*
					*/ 	ivch aqc fuseo sppe siv ivstch ivaco chech dv dltis dltr dlcch sstk scstkc spstkc wcapc wcap recch invch apalch txach aoloch fiao /*
					*/	ibc xidoc dpc txdc esubc sppiv fopo fsrco exre dp act lct re pi	rect invt xsga cogs ni
	foreach v of local vlist {
		replace `v' = `v'/gdpdeflator2000
		}		
		
	
	**** Apply various screens
	keep if (fyear>=1970)
	keep if (fyear<=2018)
		
	* Drop regulated industries
	drop if sic>=6000 & sic<7000
	drop if sic>=4900 & sic<5000
	
	* Only keep firms with valid gvkey and fiscal year
	keep if (gvkey!=.)
	keep if (fyear!=.)


	**** Full sample stats
	count
	distinct gvkey
	
	* Generate asset and sales growth variables
	sort gvkey fyear
	g atl1 		= at[_n-1] if gvkey[_n]==gvkey[_n-1] & fyear[_n]==fyear[_n-1]+1
	g salel1 	= sale[_n-1] if gvkey[_n]==gvkey[_n-1] & fyear[_n]==fyear[_n-1]+1		
	
	g 	at_growth	= (at - atl1) / atl1	
	g 	sale_growth = (sale - salel1) / salel1
	
	su at_growth, d
	su sale_growth, d
	
	**** Compute initial assets for every firm
	drop if (at == .)
	bysort gvkey: g temp = _n
	
	g temp1 = at   if temp == 1
	g temp3 = atl1 if temp == 3
	* Compute time zero assets
	egen at0 = mean(temp1), by(gvkey)	
	* Use earliest asset observation for a few firms
	egen temp4 = mean(temp3), by(gvkey)
	replace at0 = temp4 if at0 == 0 & temp4 ~= .
	drop temp1 temp3 temp4 	
	
	**** Compute average assets for every firm
	bysort gvkey: egen at_avg = mean(at)
	
	
	**** With this filter, we remove all firms that experience asset growth above 500% at some point during their life.
	g tag2 = 1 if at_growth > 5
	replace tag2 = . if at_growth == .
	
	egen tagfirm2 = mean(tag2), by(gvkey)
	replace tagfirm2 = 0 if tagfirm2 == .	
	
	drop if tagfirm2 == 1
	drop tag2 tagfirm2	

	count
	distinct gvkey
	
********************************		
**** End sample selection
********************************
		
********************************		
**** Begin variable construction
********************************
	
	sort gvkey fyear
	g	sic4 = sic
	g	sic3 = floor(sic4/10)		
	g	sic2 = floor(sic4/100)		
	g	sic1 = floor(sic4/1000)		

	* Size
	g	l_a					= ln(at)
	g	market_eq			= csho*prcc_f	
	g	l_market_eq			= ln(market_eq)
	g	mv					= market_eq + at - ceq
	g	l_sale				= ln(sale)	
		
	* Cash variables	
	g	cash_at 			= che / at
	g 	cash_atnet			= che / (at - che)
	g	cash_sales			= che / sale
	
	g 	cash_atl1 	= cash_at[_n-1] if gvkey[_n]==gvkey[_n-1] & fyear[_n]==fyear[_n-1]+1	
	g	ch_cash_at	= cash_at - cash_atl1
	drop cash_atl1
	
	g 	ch_at				= ch/at	
	
	g	ppe_at				= ppent/at
	g	td					= dltt+dlc
	g	td_at				= td/at
	g	ltd_at				= dltt/at
	g	std_at				= dlc/at	
	
	g	nd_at				= (td-che)/at
	g	td_mv				= td/mv	
	g	nd_mv				= (td-che)/mv	
	
	g	op_lev				= (xsga +cogs)/at
	g	cap_lab				= ppent/emp
	
	g chef1 	= che[_n+1] if gvkey[_n]==gvkey[_n+1] & fyear[_n]==fyear[_n+1]-1
	g chel1 	= che[_n-1] if gvkey[_n]==gvkey[_n-1] & fyear[_n]==fyear[_n-1]+1	
	
	g cash_savings = che - chel1
	g cash_savings_at = cash_savings/atl1
	
	g roa = ni/at
	
	
	* Investment variables
	g	capex_at			= capx/at
	g	capex_cap			= capx/ppent
	
	g	net_invest			= capx - dp
	g	net_invest_at		= net_invest/at
	g	net_invest_cap		= net_invest/ppent	
	
	replace xrd				= 0 if xrd == .
	g	rd_sale				= xrd/sale
	g	rd_at				= xrd/at
	
	g 	aqc_at				= aqc/at
	
	g 	div_d				= .
	replace div_d = 1 if dvc > 0
	replace div_d = 0 if dvc == 0
	replace div_d = . if dvc == .

	g tang1_at = (che + 0.715*rect + 0.547*invt + 0.535*ppent) / at
	g tang2_at = (0.715*rect + 0.547*invt + 0.535*ppent) / at
	
	g prcc_fl1 	= prcc_f[_n-1] if gvkey[_n]==gvkey[_n-1] & fyear[_n]==fyear[_n-1]+1
	
	g	stock_ret		= (prcc_f-prcc_fl1)/prcc_fl1
	g	mb				= mv/at
	g	mb_excash		= (mv-che)/(at-che)	
	g	nwc_at			= (act - lct - che)/at	
	
	
	**** Define various equity and debt issue variables
	
	* Equity and debt issue variables following McLean (2011)
	g equity_issue_at = sstk/atl1
	
	g gross_eq_issue_dummy = 1 if equity_issue_at > 0.05
	replace gross_eq_issue_dummy = 0 if gross_eq_issue_dummy == .
	replace gross_eq_issue_dummy = . if equity_issue_at == .	
	
	g net_equity_issue_at = (sstk-prstkc)/atl1	
	g net_equity_issue2_at = (sstk-dvc-dvp-prstkc)/atl1	
	
	g eq_issue_dummy = 1 if net_equity_issue2_at > 0.05
	replace eq_issue_dummy = 0 if eq_issue_dummy == .
	replace eq_issue_dummy = . if net_equity_issue2_at == .
	
	g ltdebt_issue_at = dltis/atl1
	g ltdebt_reduction_at = dltr/atl1
	g net_ltdebt_issue_at = (dltis-dltr)/atl1		
	
	g ltdebt_issue_dummy = 1 if net_ltdebt_issue_at > 0.05
	replace ltdebt_issue_dummy = 0 if ltdebt_issue_dummy == .
	replace ltdebt_issue_dummy = . if net_ltdebt_issue_at == .	

	g dlttl1 	= dltt[_n-1] if gvkey[_n]==gvkey[_n-1] & fyear[_n]==fyear[_n-1]+1		
	g dlcl1 	= dlc[_n-1] if gvkey[_n]==gvkey[_n-1] & fyear[_n]==fyear[_n-1]+1		
	g tdl1 		= td[_n-1] if gvkey[_n]==gvkey[_n-1] & fyear[_n]==fyear[_n-1]+1		

	g d_dltt_at = (dltt - dlttl1)/atl1
	g d_dlc_at = (dlc - dlcl1)/atl1
	g d_td_at = (td - tdl1)/atl1
	
	g dltt_issue_dummy = 1 if d_dltt_at > 0.05
	replace dltt_issue_dummy = 0 if dltt_issue_dummy == .
	replace dltt_issue_dummy = . if d_dltt_at == .	
	
	g dlc_issue_dummy = 1 if d_dlc_at > 0.05
	replace dlc_issue_dummy = 0 if dlc_issue_dummy == .
	replace dlc_issue_dummy = . if d_dlc_at == .	
	
	g td_issue_dummy = 1 if d_td_at > 0.05
	replace td_issue_dummy = 0 if td_issue_dummy == .
	replace td_issue_dummy = . if d_td_at == .		
	
	
	* Zscore
	g	zscore				=   1.2*((act-lct)/at)+1.4*(re/at)+3.3*(pi/at)+0.6*((prcc_f*csho)/td)+ 0.999*(sale/at)	

		
********************************		
**** End variable construction
********************************		
		
********************************		
**** Begin cash flow variable
********************************

	tsset gvkey fyear
		
	* Change in working capital (following Chang et al. 2014)
	g	d_wc1 =  wcapc if scf == 1
	g	d_wc2 = -wcapc if scf == 2
	g	d_wc3 = -wcapc if scf == 3	
	g	d_wc7 = -recch - invch - apalch - txach - aoloch - fiao if scf == 7
	egen d_wc = rowmean(d_wc1 d_wc2 d_wc3 d_wc7)
	drop d_wc1 d_wc2 d_wc3 d_wc7
	replace d_wc = 0 if d_wc == .		
		
	**** Define the main cash flow measure	
	* Cash flow measure that removes working capital (CF = EBITDA - dWC)
	g	cf5					= oibdp - d_wc
	g	cf5_at				= cf5/at
	g	cf5_atl1			= l1.cf5_at
	g	g_cf5_at 			= (cf5_at - cf5_atl1)/cf5_atl1
	
	g	cf5_at0				= cf5/at0
	g	cf5_at0l1			= l1.cf5_at0
	g	g_cf5_at0 			= (cf5_at0 - cf5_at0l1)/cf5_at0l1		
	
	g	cf5l1				= l1.cf5
	g	g_cf5 				= (cf5 - cf5l1)/cf5l1	
	
	g	cf5_atavg			= cf5/at_avg
	
	* Compute the number of cash flow observations by firm
	g	tag = 1 if cf5_at != .
	replace tag = 0 if cf5_at == .
	egen n_cf5_at = total(tag), by(gvkey)
	drop tag	
	
	g	cf5_lagat			= cf5/atl1
	g	cf5_lagatl1			= l1.cf5_lagat
	g	g_cf5_lagat 		= (cf5_lagat - cf5_lagatl1)/cf5_lagatl1
	
	* Compute the number of cash flow observations by firm
	g	tag = 1 if cf5_lagat != .
	replace tag = 0 if cf5_lagat == .
	egen n_cf5_lagat = total(tag), by(gvkey)
	drop tag	
	
	
	**** Define a second cash flow definition that does not subtract change in working capital
	* Operating income before depreciation and amortization (CF = EBITDA)
	g	cf7					= oibdp
	g	cf7_at				= cf7/at
	g	cf7_atl1			= l1.cf7_at
	g	g_cf7_at 			= (cf7_at - cf7_atl1)/cf7_atl1		

	g	cf7_at0				= cf7/at0
	g	cf7_at0l1			= l1.cf7_at0
	g	g_cf7_at0 			= (cf7_at0 - cf7_at0l1)/cf7_at0l1			
	
	g	cf7_atavg			= cf7/at_avg	
	
	* Compute the number of cash flow observations by firm	
	g	tag = 1 if cf7_at != .
	replace tag = 0 if cf7_at == .
	egen n_cf7_at = total(tag), by(gvkey)
	drop tag
	
	g	cf7_lagat			= cf7/atl1
	g	cf7_lagatl1			= l1.cf7_lagat
	g	g_cf7_lagat 		= (cf7_lagat - cf7_lagatl1)/cf7_lagatl1
	
	* Compute the number of cash flow observations by firm	
	g	tag = 1 if cf7_lagat != .
	replace tag = 0 if cf7_lagat == .
	egen n_cf7_lagat = total(tag), by(gvkey)
	drop tag		
	

* Program to compute firm-specific annual cash flow volatilities
	* temp [data#] [volatility varname prefix]
	* We compute volatilities as Acharya et al. (2011)
	capture program drop temp
    program define temp	
	* Compute lags
	sort gvkey fyear
	g		t0		= `1'
	g		t1		= `1'[_n-1] if gvkey[_n]==gvkey[_n-1] & fyear[_n]==fyear[_n-1]+1
	g		t2		= `1'[_n-2] if gvkey[_n]==gvkey[_n-2] & fyear[_n]==fyear[_n-2]+2
	g		t3		= `1'[_n-3] if gvkey[_n]==gvkey[_n-3] & fyear[_n]==fyear[_n-3]+3
	g		t4		= `1'[_n-4] if gvkey[_n]==gvkey[_n-4] & fyear[_n]==fyear[_n-4]+4
	g		t5		= `1'[_n-5] if gvkey[_n]==gvkey[_n-5] & fyear[_n]==fyear[_n-5]+5
	g		t6		= `1'[_n-6] if gvkey[_n]==gvkey[_n-6] & fyear[_n]==fyear[_n-6]+6
	g		t7		= `1'[_n-7] if gvkey[_n]==gvkey[_n-7] & fyear[_n]==fyear[_n-7]+7
	g		t8		= `1'[_n-8] if gvkey[_n]==gvkey[_n-8] & fyear[_n]==fyear[_n-8]+8	
	g		t9		= `1'[_n-9] if gvkey[_n]==gvkey[_n-9] & fyear[_n]==fyear[_n-9]+9	
	
	* Compute standard deviation
	* We require at least 5 observations
	egen	`2'		= rsd(t0 t1 t2 t3 t4 t5 t6 t7 t8 t9) if t0!=. & t1!=. & t2!=.
    drop  t0-t9
    end	
	
	
	temp cf5_at sd_cf5_at
	egen sd_indcf5_at  = mean(sd_cf5_at), by(year sic2)

	temp cf5_lagat sd_cf5_lagat
	egen sd_indcf5_lagat  = mean(sd_cf5_lagat), by(year sic2)	
	
	temp cf7_at sd_cf7_at
	egen sd_indcf7_at  = mean(sd_cf7_at), by(year sic2)
	
	temp cf7_lagat sd_cf7_lagat
	egen sd_indcf7_lagat  = mean(sd_cf7_lagat), by(year sic2)	

	* Keep the sorting variable not winsorized
	g g_cf5_lagat_sort = g_cf5_lagat
	g g_cf7_lagat_sort = g_cf7_lagat	
	
	
********************************		
**** End cash flow variable
********************************	
	
******************************************		
**** Beginn additional things to do/add
******************************************
	
**** Winsorize variables at the 1st and 99th percentiles
	local vlist at l_a market_eq mv l_market_eq l_sale cash_at cash_atnet cash_sales ch_cash_at ch_at ppe_at td_at ltd_at std_at nd_at td_mv nd_mv op_lev cap_lab capex_at capex_cap rd_sale rd_at at_nom/*
	*/			cf5_at cf5_lagat cf7_at  cf7_lagat g_cf5_at g_cf5_at0 g_cf5 g_cf5_lagat g_cf7_at g_cf7_lagat /*
	*/ 			mb mb_excash aqc_at zscore roa at_growth sale_growth nwc_at cash_savings_at net_invest_at net_invest_cap /*
	*/			equity_issue_at net_equity_issue_at net_equity_issue2_at eq_issue_dummy gross_eq_issue_dummy ltdebt_issue_at ltdebt_reduction_at /*
	*/			net_ltdebt_issue_at ltdebt_issue_dummy d_dltt_at d_dlc_at d_td_at dltt_issue_dummy dlc_issue_dummy td_issue_dummy /*
	*/			sd_cf7_at sd_indcf7_at sd_indcf7_lagat sd_cf5_at sd_indcf5_at sd_indcf5_lagat tang1_at tang2_at stock_ret
	
	foreach v of local vlist {
		winsor `v' 1
		}	

**** Create lags and leads of variables
	
	drop atl1 salel1 cf5_atl1 cf5_lagatl1 cf7_atl1 cf7_lagatl1 
	* One period lag
	sort gvkey fyear
	local vlist at l_a market_eq mv l_market_eq l_sale cash_at cash_atnet cash_sales ch_cash_at ch_at ppe_at td_at ltd_at std_at nd_at td_mv nd_mv op_lev cap_lab capex_at capex_cap rd_sale rd_at at_nom /*
	*/			cf5_at cf5_lagat cf7_at  cf7_lagat/*
	*/ 			mb mb_excash aqc_at zscore roa at_growth sale_growth nwc_at cash_savings cash_savings_at net_invest_at net_invest_cap /*
	*/			equity_issue_at net_equity_issue_at net_equity_issue2_at eq_issue_dummy gross_eq_issue_dummy ltdebt_issue_at ltdebt_reduction_at /*
	*/			net_ltdebt_issue_at ltdebt_issue_dummy d_dltt_at d_dlc_at d_td_at dltt_issue_dummy dlc_issue_dummy td_issue_dummy /*
	*/			sd_cf7_at sd_indcf7_at sd_indcf7_lagat sd_cf5_at sd_indcf5_at sd_indcf5_lagat tang1_at tang2_at stock_ret
		
	local vlistl1
	foreach v of local vlist {
		g `v'l1 = `v'[_n-1] if gvkey[_n]==gvkey[_n-1] & fyear[_n]==fyear[_n-1]+1
		local vlistl1 `vlistl1' `v'l1
		}
	
	* One period lead
	sort gvkey year
	local vlist at l_a market_eq mv l_market_eq l_sale cash_at cash_atnet cash_sales ch_cash_at ch_at ppe_at td_at ltd_at std_at nd_at td_mv nd_mv op_lev cap_lab capex_at capex_cap rd_sale rd_at at_nom /*
	*/			cf5_at cf5_lagat cf7_at  cf7_lagat/*
	*/ 			mb mb_excash aqc_at zscore roa at_growth sale_growth nwc_at cash_savings cash_savings_at net_invest_at net_invest_cap /*
	*/			equity_issue_at net_equity_issue_at net_equity_issue2_at eq_issue_dummy gross_eq_issue_dummy ltdebt_issue_at ltdebt_reduction_at /*
	*/			net_ltdebt_issue_at ltdebt_issue_dummy d_dltt_at d_dlc_at d_td_at dltt_issue_dummy dlc_issue_dummy td_issue_dummy /*
	*/			sd_cf7_at sd_indcf7_at sd_indcf7_lagat sd_cf5_at sd_indcf5_at sd_indcf5_lagat tang1_at tang2_at stock_ret
		
	local vlistf1
	foreach v of local vlist {
		g `v'f1 = `v'[_n+1] if gvkey[_n]==gvkey[_n+1] & fyear[_n]==fyear[_n+1]-1
		local vlistf1 `vlistf1' `v'f1
		}
	
	
	* Drop variables we do not need anymore
	drop	 prcc_f capx xrd ib dp oibdp ceq  	/*
				*/ 	ivch aqc fuseo sppe siv ivstch ivaco dv dltis dltr dlcch recch invch apalch txach aoloch fiao /*
				*/	ibc xidoc dpc txdc esubc sppiv fopo fsrco exre
	
	
	**** Merge with fitted Herfindahl-Hirschmann Index data
	sort sic3 year
	merge m:1 sic3 year using "$data/fitted_hhi.dta"
	drop if _merge == 2
	drop _merge
	
	
	**** Merge with credit line data
	sort gvkey year
	merge m:1 gvkey year using "$data\credit_lines.dta"
	drop if _merge == 2
	drop _merge	
	
	replace facilityamt = facilityamt/1000000
	replace facamt_mean = facamt_mean/1000000
	
	g facamt_m = facilityamt
	replace facamt_m = 0 if facamt_m == .
	
	replace facilityamt = facilityamt/gdpdeflator2000
	replace facamt_mean = facamt_mean/gdpdeflator2000
	
	sort gvkey fyear
	g facamt_ml1 = facamt_m[_n-1] if gvkey[_n]==gvkey[_n-1] & fyear[_n]==fyear[_n-1]+1
	g ch_facamt = facamt_m - facamt_ml1
	
	* Dummy for increases in credit line amounts
	g d_cl_increase = 1 if ch_facamt > 0
	replace d_cl_increase = 0 if d_cl_increase == .
	replace d_cl_increase = . if facilityamt == .
	
	
	** Create other credit line variables
	
	* Credit lines to total assets
	g creditline_at = facilityamt/atl1	
	g creditline_at_issue = facilityamt/atl1
	replace creditline_at_issue = 0 if nb_creditline_issue == 0
	
	winsor creditline_at 1
	winsor creditline_at_issue 1
	
	g maturity_issue = maturity
	replace maturity_issue = . if nb_creditline_issue == 0

	g allindrawn_issue = allindrawn
	replace allindrawn_issue = . if nb_creditline_issue == 0	
	
	* Credit lines to total debt	
	g creditline_td = facilityamt/td_nom
	winsor creditline_td 1
	
	g clmean_td = facamt_mean/td_nom
	winsor clmean_td 1
	
	g l_nb_creditline_out = ln(1+nb_creditline_out)
	g l_creditlineamt = ln(1+facilityamt)
	
	g creditline_dummy = 1 if nb_creditline_issue >0
	replace creditline_dummy = 0 if nb_creditline_issue == 0
	replace creditline_dummy = . if nb_creditline_issue == .
	
	g l_nb_creditline_issue = ln(1+nb_creditline_issue)	
	
	g l_maturity = ln(1 + maturity)
	g l_loanspread = ln(allindrawn)
			
		
	**** Merge with bond data from FISD
	g str6 cusip_issuer = cusip
	
	sort cusip_issuer year
	merge m:1 cusip_issuer year using "$data\bonds.dta"
	drop if _merge == 2
	drop _merge			
		
	replace bond_amount = bond_amount/1000	
		
	replace bond_amount = bond_amount/gdpdeflator2000
		
	* Bonds to total assets
	g bonds_at = bond_amount/atl1
	g bonds_at_issue = bond_amount/atl1
	replace bonds_at_issue = 0 if nb_bond_issue == 0		
	
	winsor bonds_at 1		
	winsor bonds_at_issue 1				
	
	g bond_maturity_issue = bond_maturity
	replace bond_maturity_issue = . if nb_bond_issue == 0

	g bond_spread_issue = bond_spread
	replace bond_spread_issue = . if nb_bond_issue == 0		
	
	
	g l_nb_bond_out = ln(1+nb_bond_out)
	g l_bondamt = ln(1+bond_amount)
	
	g bond_dummy = 1 if nb_bond_issue >0
	replace bond_dummy = 0 if nb_bond_issue == 0
	replace bond_dummy = . if nb_bond_issue == .
	
	g l_nb_bond_issue = ln(1+nb_bond_issue)			
		
		
	**** Merge with CRSP stock return volatility data
	rename cusip cusip6
	g str8 cusip = cusip6 
	
	sort cusip year
	merge m:1 cusip year using "$data\crsp_vol.dta" 
	drop if _merge == 2
	drop _merge
	
	g naive_D = td
	g debt_vol 		= 0.05 + 0.25*equity_vol
	g asset_vol 	= (market_eq/(market_eq+naive_D))*equity_vol + (naive_D/(market_eq+naive_D))*debt_vol
	
	g dd 		= ((ln(market_eq+naive_D)/naive_D) + (stock_ret - 0.5*asset_vol^2))/asset_vol
	g dp 		= normal(-dd)
	
******************************************		
**** End additional things to do/add
******************************************	
	
**** Save the dataset 
	sort gvkey fyear
	
	save "$data\GMMSV_sample.dta", replace	
		
************************************************************
**** End: Extract and Format of annual Compustat data ****
************************************************************


*********************************************
**** Begin: Sorting and Grouping of data ****
*********************************************
	use "$data3\GMMSV_sample.dta", clear
	
	global finalcats "sic3"
	
* 1. Choose filter and sorting parameters
* 	 Choose the three parameters below (year, number of cash flows, number of firms per bin)
	 scalar mincfobs = 10    
	 scalar minfperbin = 10 
	 scalar minfirmsresid = 5
	 scalar y0 = 1971
	 scalar maxquants = 45
	
* 2. Apply filters 	
	 keep if fyear >= y0
	 
	 keep if n_cf5_at >= mincfobs
	 
* 3. Generate new categories based on a {SIC} x {quantile of var} arrangement 
** 3.1 Determine a list of variables to use as sorting criteria within SIC

**** We define the sorting variable

	**** The main cash flow measure is cf5 (CF = EBITDA - dWC)
	drop g_cf5_lagat
	g g_cf5_lagat = g_cf5_lagat_sort
	global sortvars = "g_cf5_lagat"	

	**** Alternatively, the sort can be done using cf7 (CF = EBITDA)
	* Using the defintion cf7 will produce the data to obtain the estimates
	* summarized in Table 11 of the paper
	*drop g_cf7_lagat
	*g g_cf7_lagat = g_cf7_lagat_sort
	*global sortvars = "g_cf7_lagat"	
		
	
** 3.2 Loop over varlist
	foreach vsort of varlist $sortvars {
	* For given sorting variable, calculate number of usable firms 
	bysort gvkey: egen varmean_gvk   = mean(`vsort')
	bysort gvkey: egen firstobs_gvk  =  min(fyear)  if `vsort' ~= .
	
	* Compute number of firms per sic with non-missing values
	bysort sic3: egen n_varsic3 = count(firstobs_gvk) if fyear == firstobs_gvk & firstobs_gvk ~= .
	* Define number of bins to split into
	gen n_bins3 = min(maxquants,int(n_varsic3 / minfperbin)) if n_varsic3 ~= .	
	
	* Define quartile sorting variables (default: only one bin per sic)
	gen `vsort'_qsic3 = 1

	* Generate quantile groups by sic (need to loop as number of quantiles changes! Pain.)
	levelsof sic3, local(levs3)
	foreach s of local levs3 {
		* Drop firms with no available sorting var observations IF the industry can otherwise be sorted into at least two groups
	    egen tempcount = mean(n_bins3) if sic3 == `s'
		drop if varmean_gvk == . & tempcount >= 2
		drop tempcount
		
		* Sort 'sortable' industries
		quietly summ n_bins3 if sic3 == `s'
			scalar obs = `r(N)' 
			if obs > 2*minfperbin {
				local nb = `r(max)'
				disp `nb'
				xtile tempq = varmean_gvk if fyear == firstobs_gvk & sic3 == `s', nq(`nb')
				replace `vsort'_qsic3 = tempq if tempq ~ = .
				drop tempq
			}
		}
		
		
	* Average sort bin by gvkey
	bysort gvkey: egen `vsort'_qwithsic3 = max(`vsort'_qsic3)
	* Generate cartesian product (and number of categories)
	egen sic3_by_`vsort' = group(sic3 `vsort'_qwithsic3)	
	
	
	* Drop variables
	drop varmean_gvk firstobs_gvk n_varsic3 n_bins3 
	drop `vsort'_qsic3 `vsort'_qwithsic3 
	global finalcats "$finalcats sic3_by_`vsort'"
	}	
	
	* Must save before reshape 
	save "$data/CFdata_temp.dta", replace	
	
	use "$data/CFdata_temp.dta", clear	

	* 4. We need to reshape the data to format for Loriano
	keep gvkey fyear cf5_at0 $finalcats
	
	reshape wide cf5_at0, i(gvkey) j(fyear)
	
	* 5. Screen and summarize (can keep or drop)
	egen n_sic3 = count(gvkey), by(sic3)
	
	egen n_sic3_by_g_cf5_lagat = count(gvkey), by(sic3_by_g_cf5_lagat)
	drop if n_sic3_by_g_cf5_lagat < minfirmsresid

	egen cfusable = rownonmiss(cf5_at01971-cf5_at02018)	
	
	 foreach catvar of varlist $finalcats {
		egen nfirms_`catvar'     = count(gvkey), by(`catvar')
		egen ncashflows_`catvar' = sum(cfusable), by(`catvar')
		egen groupid = group(`catvar')
		egen ncats_`catvar'      = max(groupid)
		drop groupid
		gen  ncfperfirm_`catvar' = ncashflows_`catvar' / nfirms_`catvar'
	}
	
	* 6. Save
	keep gvkey sic3 n_sic3 sic3_by_g_cf* n_sic3_by_g_cf* cf*
	
	sort sic3_by_g_cf gvkey
	browse gvkey sic3 sic3_by_g_cf* n_sic3 n_sic3_by_g_cf* cf*
	* Copy-paste the data on viewer to excel or directly to Matlab sheet

	save "$data/GMMSV_CFdata_estimation.dta", replace
	
	
*******************************************
**** End: Sorting and Grouping of data ****
*******************************************
	
	
	
